|
|
|
@ -1,9 +1,9 @@
|
|
|
|
|
package com.bjtds.brichat.service.dify.impl;
|
|
|
|
|
|
|
|
|
|
import cn.hutool.json.JSONUtil;
|
|
|
|
|
import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
|
|
|
|
|
import com.bjtds.brichat.entity.dataset.RetrievalModel;
|
|
|
|
|
import com.bjtds.brichat.entity.dify.DatasetDto;
|
|
|
|
|
import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
|
|
|
|
|
import com.bjtds.brichat.entity.dify.*;
|
|
|
|
|
import com.bjtds.brichat.entity.dto.PdfConversionResponse;
|
|
|
|
|
import com.bjtds.brichat.entity.dto.PdfTaskDto;
|
|
|
|
|
import com.bjtds.brichat.mapper.postgresql.DifyDatasetsMapper;
|
|
|
|
@ -14,11 +14,17 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
|
|
import io.github.guoshiqiufeng.dify.dataset.DifyDataset;
|
|
|
|
|
import io.github.guoshiqiufeng.dify.dataset.dto.request.DatasetInfoRequest;
|
|
|
|
|
import io.github.guoshiqiufeng.dify.dataset.dto.response.DatasetInfoResponse;
|
|
|
|
|
import io.github.guoshiqiufeng.dify.dataset.dto.response.UploadFileInfoResponse;
|
|
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
|
import org.apache.poi.hwpf.HWPFDocument;
|
|
|
|
|
import org.apache.poi.hwpf.extractor.WordExtractor;
|
|
|
|
|
import org.apache.poi.hwpf.usermodel.CharacterRun;
|
|
|
|
|
import org.apache.poi.hwpf.usermodel.Paragraph;
|
|
|
|
|
import org.apache.poi.hwpf.usermodel.Range;
|
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFDocument;
|
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
|
|
|
|
|
import org.apache.poi.xwpf.usermodel.XWPFRun;
|
|
|
|
|
import org.docx4j.openpackaging.exceptions.Docx4JException;
|
|
|
|
|
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
|
|
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
@ -37,9 +43,7 @@ import org.springframework.web.client.RestTemplate;
|
|
|
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
|
|
|
|
|
|
import javax.annotation.Resource;
|
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.io.*;
|
|
|
|
|
import java.util.*;
|
|
|
|
|
import java.util.concurrent.TimeUnit;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
@ -85,6 +89,104 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
@Resource
|
|
|
|
|
private DifyDatasetsMapper difyDatasetsMapper;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ResponseEntity<DifyMetadata> addMetadata(String datasetId, DifyMetadata metadata) {
|
|
|
|
|
// 1. 设置请求URL
|
|
|
|
|
String url = difyUrl+ Constants.DATABASE_API + "/" + datasetId + "/metadata";
|
|
|
|
|
|
|
|
|
|
// 2. 设置请求头
|
|
|
|
|
HttpHeaders headers = new HttpHeaders();
|
|
|
|
|
headers.set("Authorization", Constants.BEARER +apiKey); // Bearer认证
|
|
|
|
|
headers.setContentType(MediaType.APPLICATION_JSON); // 设置JSON类型
|
|
|
|
|
|
|
|
|
|
// 3. 构建请求体(包含name和description的JSON对象)
|
|
|
|
|
Map<String, String> requestBody = new HashMap<>();
|
|
|
|
|
requestBody.put("type", metadata.getType());
|
|
|
|
|
requestBody.put("name", metadata.getName());
|
|
|
|
|
// 4. 创建包含请求体和头的HttpEntity
|
|
|
|
|
HttpEntity<Map<String, String>> requestEntity = new HttpEntity<>(requestBody, headers);
|
|
|
|
|
|
|
|
|
|
// 5. 发送POST请求
|
|
|
|
|
return restTemplate.exchange(
|
|
|
|
|
url,
|
|
|
|
|
HttpMethod.POST,
|
|
|
|
|
requestEntity,
|
|
|
|
|
new ParameterizedTypeReference<DifyMetadata>() {} // 解决泛型类型擦除问题
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ResponseEntity<DifyGetMetadatasRes> getMetadatas(String datasetId) {
|
|
|
|
|
|
|
|
|
|
// 1. 设置请求URL
|
|
|
|
|
String url = difyUrl+ Constants.DATABASE_API + "/" + datasetId + "/metadata";
|
|
|
|
|
|
|
|
|
|
// 2. 设置请求头
|
|
|
|
|
HttpHeaders headers = new HttpHeaders();
|
|
|
|
|
headers.set("Authorization", Constants.BEARER +apiKey); // Bearer认证
|
|
|
|
|
headers.setContentType(MediaType.APPLICATION_JSON); // 设置JSON类型
|
|
|
|
|
HttpEntity<?> requestEntity = new HttpEntity<>(headers);
|
|
|
|
|
return restTemplate.exchange(
|
|
|
|
|
url,
|
|
|
|
|
HttpMethod.GET,
|
|
|
|
|
requestEntity,
|
|
|
|
|
new ParameterizedTypeReference<DifyGetMetadatasRes>() {} // 解决泛型类型擦除问题
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ResponseEntity<Map> setMetadataForDoc(String datasetId, DifyMatedataAnnoReq metadataAnnoReq) {
|
|
|
|
|
|
|
|
|
|
// 1. 设置请求URL
|
|
|
|
|
String url = difyUrl+ Constants.DATABASE_API + "/" + datasetId + "/documents/metadata";
|
|
|
|
|
// 2. 设置请求头
|
|
|
|
|
HttpHeaders headers = new HttpHeaders();
|
|
|
|
|
headers.set("Authorization", Constants.BEARER +apiKey); // Bearer认证
|
|
|
|
|
headers.setContentType(MediaType.APPLICATION_JSON); // 设置JSON类型
|
|
|
|
|
|
|
|
|
|
// 3. 构建请求体
|
|
|
|
|
Map<String, Object> requestBody = new HashMap<>();
|
|
|
|
|
|
|
|
|
|
// 构建operation_data数组 - 修复:应该是数组格式
|
|
|
|
|
List<DifyMatedataAnnoReq> operationDataList = new ArrayList<>();
|
|
|
|
|
operationDataList.add(metadataAnnoReq);
|
|
|
|
|
requestBody.put("operation_data", operationDataList);
|
|
|
|
|
|
|
|
|
|
// 4. 创建包含请求体和头的HttpEntity
|
|
|
|
|
HttpEntity<Map<String, Object>> requestEntity = new HttpEntity<>(requestBody, headers);
|
|
|
|
|
|
|
|
|
|
// 5. 发送POST请求 - 修复:使用String类型接收响应,然后手动处理
|
|
|
|
|
try {
|
|
|
|
|
ResponseEntity<String> response = restTemplate.exchange(
|
|
|
|
|
url,
|
|
|
|
|
HttpMethod.POST,
|
|
|
|
|
requestEntity,
|
|
|
|
|
String.class
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
// 手动构建返回的Map
|
|
|
|
|
Map<String, Object> resultMap = new HashMap<>();
|
|
|
|
|
resultMap.put("success", true);
|
|
|
|
|
resultMap.put("message", "元数据设置成功");
|
|
|
|
|
resultMap.put("response", response.getBody());
|
|
|
|
|
|
|
|
|
|
return ResponseEntity.status(response.getStatusCode()).body(resultMap);
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
logger.error("设置文档元数据失败: {}", e.getMessage(), e);
|
|
|
|
|
Map<String, Object> errorMap = new HashMap<>();
|
|
|
|
|
errorMap.put("success", false);
|
|
|
|
|
errorMap.put("message", "元数据设置失败: " + e.getMessage());
|
|
|
|
|
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMap);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public ResponseEntity<DatasetDto> createDataset(String name, String description) {
|
|
|
|
|
// 1. 设置请求URL
|
|
|
|
@ -190,6 +292,9 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
// 2. 执行正常文件上传逻辑
|
|
|
|
|
return handleNormalFile(request, file);
|
|
|
|
|
}
|
|
|
|
|
//3 为文件标注预览路径的元数据
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
@ -306,18 +411,62 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
|
|
|
|
|
// 发送请求
|
|
|
|
|
try {
|
|
|
|
|
return restTemplate.exchange(
|
|
|
|
|
ResponseEntity<Map> exchange = restTemplate.exchange(
|
|
|
|
|
url,
|
|
|
|
|
HttpMethod.POST,
|
|
|
|
|
requestEntity,
|
|
|
|
|
Map.class,
|
|
|
|
|
uriVariables
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
//3. 为文档添加元数据
|
|
|
|
|
Map<String,String> document = ( Map<String,String> ) exchange.getBody().get("document");
|
|
|
|
|
String documentId = document.get("id");
|
|
|
|
|
|
|
|
|
|
UploadFileInfoResponse uploadFileInfoResponse = difyDatasetService.uploadFileInfo(request.getDatasetId(), documentId);
|
|
|
|
|
String urls = uploadFileInfoResponse.getUrl();
|
|
|
|
|
String fullUrl = difyUrl + urls;
|
|
|
|
|
setSourceUrlMatedata(request.getDatasetId(),documentId,fullUrl);
|
|
|
|
|
|
|
|
|
|
return exchange;
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
throw new RuntimeException("文档上传失败: " + e.getMessage(), e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private void setSourceUrlMatedata(String datasetId,String documentId,String sourceUrl){
|
|
|
|
|
try {
|
|
|
|
|
//1. 查询知识库是否存在source元数据
|
|
|
|
|
ResponseEntity<DifyGetMetadatasRes> res = getMetadatas(datasetId);
|
|
|
|
|
|
|
|
|
|
List<DifyMetadata> docMetadatas = res.getBody().getDocMetadatas();
|
|
|
|
|
DifyMetadata metadata = null;
|
|
|
|
|
DifyMatedataAnnoReq metadataAnnoReq = new DifyMatedataAnnoReq();
|
|
|
|
|
metadataAnnoReq.setDocumentId(documentId);
|
|
|
|
|
if (docMetadatas == null || docMetadatas.isEmpty() ||docMetadatas.stream().noneMatch(m -> "source_url".equals(m.getName()))) {
|
|
|
|
|
metadata = addMetadata(datasetId, DifyMetadata.builder().type("string").name("source_url").build()).getBody();
|
|
|
|
|
}
|
|
|
|
|
if (metadata == null) {
|
|
|
|
|
metadata = docMetadatas.stream().map(m -> {
|
|
|
|
|
if ("source_url".equals(m.getName())) {
|
|
|
|
|
m.setValue(sourceUrl);
|
|
|
|
|
}
|
|
|
|
|
return m;
|
|
|
|
|
}).findFirst().orElse(null);
|
|
|
|
|
}
|
|
|
|
|
metadata.setValue(sourceUrl);
|
|
|
|
|
metadataAnnoReq.setMetadataList(Collections.singletonList(metadata));
|
|
|
|
|
ResponseEntity<Map> result = setMetadataForDoc(datasetId, metadataAnnoReq);
|
|
|
|
|
logger.info("为文档 {} 设置source_url元数据成功", documentId);
|
|
|
|
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
|
// 元数据设置失败不应该影响文档上传的主流程
|
|
|
|
|
logger.warn("为文档 {} 设置source_url元数据失败,但文档上传成功: {}", documentId, e.getMessage());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* 调用PDF转换服务
|
|
|
|
|
*/
|
|
|
|
@ -414,8 +563,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
|
|
|
|
|
// 读取转换后的文件内容 - Java 8兼容
|
|
|
|
|
byte[] docxBytes;
|
|
|
|
|
try (java.io.FileInputStream fis = new java.io.FileInputStream(tempFile)) {
|
|
|
|
|
docxBytes = org.apache.commons.io.IOUtils.toByteArray(fis);
|
|
|
|
|
try (FileInputStream fis = new FileInputStream(tempFile)) {
|
|
|
|
|
docxBytes = IOUtils.toByteArray(fis);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
logger.info("docx4j转换成功: {} -> {} (大小: {} -> {} bytes)",
|
|
|
|
@ -440,7 +589,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} catch (org.docx4j.openpackaging.exceptions.Docx4JException e) {
|
|
|
|
|
} catch (Docx4JException e) {
|
|
|
|
|
logger.warn("docx4j无法识别文件格式,可能是较旧的DOC格式: {}", e.getMessage());
|
|
|
|
|
return handleOlderDocFormat(docFile, originalFilename);
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
@ -520,7 +669,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
// 读取DOC文档
|
|
|
|
|
try (java.io.InputStream inputStream = docFile.getInputStream()) {
|
|
|
|
|
try (InputStream inputStream = docFile.getInputStream()) {
|
|
|
|
|
docDocument = new HWPFDocument(inputStream);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -561,7 +710,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
private void convertDocContentToDocx(HWPFDocument docDocument, XWPFDocument docxDocument) {
|
|
|
|
|
try {
|
|
|
|
|
// 获取文档范围
|
|
|
|
|
org.apache.poi.hwpf.usermodel.Range documentRange = docDocument.getRange();
|
|
|
|
|
Range documentRange = docDocument.getRange();
|
|
|
|
|
|
|
|
|
|
// 按段落处理
|
|
|
|
|
int numParagraphs = documentRange.numParagraphs();
|
|
|
|
@ -569,7 +718,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < numParagraphs; i++) {
|
|
|
|
|
try {
|
|
|
|
|
org.apache.poi.hwpf.usermodel.Paragraph hwpfParagraph = documentRange.getParagraph(i);
|
|
|
|
|
Paragraph hwpfParagraph = documentRange.getParagraph(i);
|
|
|
|
|
String paragraphText = hwpfParagraph.text();
|
|
|
|
|
|
|
|
|
|
// 跳过空段落和只包含控制字符的段落
|
|
|
|
@ -585,7 +734,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
int numCharacterRuns = hwpfParagraph.numCharacterRuns();
|
|
|
|
|
for (int j = 0; j < numCharacterRuns; j++) {
|
|
|
|
|
try {
|
|
|
|
|
org.apache.poi.hwpf.usermodel.CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
|
|
|
|
|
CharacterRun characterRun = hwpfParagraph.getCharacterRun(j);
|
|
|
|
|
String runText = characterRun.text();
|
|
|
|
|
|
|
|
|
|
if (runText != null && !runText.trim().isEmpty()) {
|
|
|
|
@ -744,7 +893,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
private String extractTextFromDocSimple(MultipartFile docFile) {
|
|
|
|
|
try {
|
|
|
|
|
// 尝试使用HWPFDocument提取文本(最基本的方式)
|
|
|
|
|
try (java.io.InputStream inputStream = docFile.getInputStream()) {
|
|
|
|
|
try (InputStream inputStream = docFile.getInputStream()) {
|
|
|
|
|
HWPFDocument docDocument = new HWPFDocument(inputStream);
|
|
|
|
|
WordExtractor extractor = new WordExtractor(docDocument);
|
|
|
|
|
String text = extractor.getText();
|
|
|
|
@ -765,7 +914,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
private byte[] convertDocToDocxUsingPOI(MultipartFile docFile) throws Exception {
|
|
|
|
|
ByteArrayOutputStream baos = null;
|
|
|
|
|
XWPFDocument docxDocument = null;
|
|
|
|
|
java.io.InputStream docInputStream = null;
|
|
|
|
|
InputStream docInputStream = null;
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
// 创建输入流
|
|
|
|
@ -802,7 +951,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
/**
|
|
|
|
|
* 从DOC文件中提取文本
|
|
|
|
|
*/
|
|
|
|
|
private String extractTextFromDoc(java.io.InputStream docInputStream) throws Exception {
|
|
|
|
|
private String extractTextFromDoc(InputStream docInputStream) throws Exception {
|
|
|
|
|
HWPFDocument docDocument = null;
|
|
|
|
|
WordExtractor extractor = null;
|
|
|
|
|
|
|
|
|
@ -811,8 +960,8 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
if (!docInputStream.markSupported()) {
|
|
|
|
|
// 如果不支持mark,将流转换为ByteArrayInputStream
|
|
|
|
|
// Java 8兼容:使用Apache Commons IO读取所有字节
|
|
|
|
|
byte[] bytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
|
|
|
|
|
docInputStream = new java.io.ByteArrayInputStream(bytes);
|
|
|
|
|
byte[] bytes = IOUtils.toByteArray(docInputStream);
|
|
|
|
|
docInputStream = new ByteArrayInputStream(bytes);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
docInputStream.mark(Integer.MAX_VALUE);
|
|
|
|
@ -842,11 +991,11 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
/**
|
|
|
|
|
* 备用的文本提取方法
|
|
|
|
|
*/
|
|
|
|
|
private String extractTextWithAlternativeMethod(java.io.InputStream docInputStream) throws Exception {
|
|
|
|
|
private String extractTextWithAlternativeMethod(InputStream docInputStream) throws Exception {
|
|
|
|
|
try {
|
|
|
|
|
// 尝试使用更宽松的方式读取
|
|
|
|
|
// Java 8兼容:使用Apache Commons IO读取所有字节
|
|
|
|
|
byte[] docBytes = org.apache.commons.io.IOUtils.toByteArray(docInputStream);
|
|
|
|
|
byte[] docBytes = IOUtils.toByteArray(docInputStream);
|
|
|
|
|
|
|
|
|
|
// 简单的文本提取 - 寻找可能的文本内容
|
|
|
|
|
String content = new String(docBytes, "UTF-8");
|
|
|
|
@ -905,7 +1054,7 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
/**
|
|
|
|
|
* 安全关闭资源
|
|
|
|
|
*/
|
|
|
|
|
private void closeResources(java.io.InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
|
|
|
|
|
private void closeResources(InputStream inputStream, XWPFDocument docxDocument, ByteArrayOutputStream outputStream) {
|
|
|
|
|
try {
|
|
|
|
|
if (inputStream != null) inputStream.close();
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
@ -972,13 +1121,13 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public java.io.InputStream getInputStream() throws IOException {
|
|
|
|
|
return new java.io.ByteArrayInputStream(content);
|
|
|
|
|
public InputStream getInputStream() throws IOException {
|
|
|
|
|
return new ByteArrayInputStream(content);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@Override
|
|
|
|
|
public void transferTo(File dest) throws IOException, IllegalStateException {
|
|
|
|
|
try (java.io.FileOutputStream fos = new java.io.FileOutputStream(dest)) {
|
|
|
|
|
try (FileOutputStream fos = new FileOutputStream(dest)) {
|
|
|
|
|
fos.write(content);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|