深度解析后端代码提交

This commit is contained in:
wenjinbo 2025-07-29 17:47:34 +08:00
parent 7a8413cf38
commit 6b0b97d0a6
12 changed files with 330 additions and 22 deletions

View File

@ -3,9 +3,11 @@ package com.bjtds;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.scheduling.annotation.EnableScheduling;
@SpringBootApplication
@EnableFeignClients
@EnableScheduling
public class BriChatServiceApplication {
public static void main(String[] args) {

View File

@ -2,6 +2,7 @@ package com.bjtds.brichat.controller;
import cn.hutool.core.io.resource.InputStreamResource;
import com.bjtds.brichat.entity.dataset.DatasetsDocRenameReq;
import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
import com.bjtds.brichat.entity.dto.PdfTaskDto;
import com.bjtds.brichat.service.DatasetsDocService;
import com.bjtds.brichat.service.dify.DifyDatasetApiService;
import com.bjtds.brichat.util.Constants;
@ -16,8 +17,10 @@ import io.github.guoshiqiufeng.dify.dataset.dto.response.DocumentInfo;
import io.github.guoshiqiufeng.dify.dataset.dto.response.UploadFileInfoResponse;
import io.swagger.annotations.Api;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.http.*;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.client.RestTemplate;
@ -25,6 +28,9 @@ import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
@ -59,7 +65,8 @@ public class DatasetDocController {
@Value("${dify.url}")
private String difyUrl;
@Autowired
private RedisTemplate<String, Object> redisTemplate;
@PostMapping("/page")
@ -168,6 +175,61 @@ public class DatasetDocController {
}
/**
 * Lists the in-flight deep-analysis tasks (PDF -> Markdown conversions).
 * Task ids are read from the Redis list "pdf:conversion:tasks:list"; each id
 * is then resolved to its PdfTaskDto stored under field "taskInfo" of the
 * per-task hash. Ids whose details are missing or malformed are skipped.
 *
 * @return ResultUtils wrapping the list of PdfTaskDto entries (possibly empty)
 */
@GetMapping("/deepAnalysisList")
public ResultUtils getDeepAnalysisList(){
    List<PdfTaskDto> tasks = new ArrayList<>();
    try {
        // 1. Fetch every queued task id.
        List<Object> ids = redisTemplate.opsForList().range(Constants.PDF_TASK_REDIS_KEY + ":list", 0, -1);
        if (ids == null || ids.isEmpty()) {
            log.info("当前没有正在处理的PDF转换任务");
            return ResultUtils.success(tasks);
        }
        log.info("获取到{}个PDF转换任务", ids.size());
        // 2. Resolve each id to its task details.
        for (Object idObj : ids) {
            if (idObj == null) {
                continue;
            }
            String id = idObj.toString();
            try {
                // 3. Details live under the "taskInfo" field of the per-task hash.
                Object raw = redisTemplate.opsForHash().get(Constants.PDF_TASK_REDIS_KEY + ":" + id, "taskInfo");
                if (raw instanceof PdfTaskDto) {
                    PdfTaskDto dto = (PdfTaskDto) raw;
                    tasks.add(dto);
                    log.debug("获取任务信息成功: taskId={}, name={}, percent={}",
                            id, dto.getName(), dto.getPercent());
                } else {
                    // Hash may have expired (24h TTL) while the id stayed in the list.
                    log.warn("任务{}的信息格式不正确或不存在", id);
                }
            } catch (Exception e) {
                // One bad entry must not break the whole listing.
                log.error("获取任务{}的详细信息失败: {}", id, e.getMessage(), e);
            }
        }
        log.info("成功获取{}个有效的PDF转换任务信息", tasks.size());
        return ResultUtils.success(tasks);
    } catch (Exception e) {
        log.error("获取深度解析任务列表失败: {}", e.getMessage(), e);
        return ResultUtils.error("获取深度解析任务列表失败: " + e.getMessage());
    }
}

View File

@ -17,6 +17,8 @@ public class DocumentUploadReq implements Serializable {
private String datasetId;
private Boolean deepAnalysis;
@JsonProperty("original_document_id")
@JsonAlias({"originalDocumentId"})
private String originalDocumentId;

View File

@ -4,4 +4,6 @@ import com.bjtds.brichat.entity.dataset.DatasetsDocRenameReq;
/**
 * Service for operations on Dify dataset documents stored in the local database.
 */
public interface DatasetsDocService {
    /**
     * Renames a document.
     *
     * @param request rename request carrying the document id and new name
     */
    void renameFile(DatasetsDocRenameReq request);
    /**
     * Resolves the upload-file id for a document.
     *
     * @param id document id
     * @return the "upload_file_id" extracted from the document's data-source JSON
     */
    String getUploadById(String id);
}

View File

@ -4,45 +4,76 @@ import com.bjtds.brichat.entity.dataset.DocumentUploadReq;
import com.bjtds.brichat.entity.dataset.RetrievalModel;
import com.bjtds.brichat.entity.dify.DatasetDto;
import com.bjtds.brichat.entity.dify.DifyDatasetResponse;
import com.bjtds.brichat.entity.dto.PdfConversionResponse;
import com.bjtds.brichat.entity.dto.PdfTaskDto;
import com.bjtds.brichat.service.dify.DifyDatasetApiService;
import com.bjtds.brichat.util.Constants;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.github.guoshiqiufeng.dify.dataset.DifyDataset;
import io.github.guoshiqiufeng.dify.dataset.dto.request.DatasetInfoRequest;
import io.github.guoshiqiufeng.dify.dataset.dto.response.DatasetInfoResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.ParameterizedTypeReference;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.http.*;
import org.springframework.stereotype.Service;
import org.springframework.util.LinkedMultiValueMap;
import org.springframework.util.MultiValueMap;
import org.springframework.util.StringUtils;
import org.springframework.web.client.RestTemplate;
import org.springframework.web.multipart.MultipartFile;
import javax.annotation.Resource;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@Service
public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
private static final Logger logger = LoggerFactory.getLogger(DifyDatasetApiServiceImpl.class);
private static final String PDF_TASK_REDIS_KEY = "pdf:conversion:tasks";
@Autowired
private RestTemplate restTemplate;
@Autowired
@Qualifier("redisTemplate")
private RedisTemplate<String, Object> redisTemplate;
//开源组件
@Resource
private DifyDataset difyDatasetService;
// @Value("${bjtds.difyDatasets.datasetPath}")
// @Value("${dify.url}")
// private String datasetPat;
@Value("${dify.url}")
private String difyUrl;
@Value("${dify.dataset.api-key}")
private String apiKey;
@Value("${pdf.conversion.service.url}")
private String pdfConversionServiceUrl;
@Value("${pdf.conversion.service.api-url}")
private String pdfConversionApiUrl;
@Value("${pdf.conversion.service.model-name}")
private String pdfConversionModelName;
@Value("${pdf.conversion.service.max-workers}")
private Integer pdfConversionMaxWorkers;
@Override
public ResponseEntity<DatasetDto> createDataset(String name, String description) {
// 1. 设置请求URL
@ -98,6 +129,23 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
}
/***
*
*
*1 若文件为类型pdf
* 1.1远程调用接口将pdf文件解析并转换markdown文件,获取回调任务id
* 2.1将回调的任务id放入redis队列中
* redis中存储的是一个对象数组,对象包含 name,taskId,percent(解析百分比)
* 3.1定时任务读取任务id的状态,若已解析完毕上传md文件至dify知识库中(在其他类中进行)
*
*2 若文件类型为其他类型,正常执行上传逻辑
*
*
* @param request
* @param file
* @return
* @throws JsonProcessingException
*/
@Override
public ResponseEntity<Map> createDocumentByFile(DocumentUploadReq request, MultipartFile file) throws JsonProcessingException {
// 参数验证
@ -109,6 +157,81 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
throw new IllegalArgumentException("上传文件不能为空");
}
// 获取文件名和扩展名
String originalFilename = file.getOriginalFilename();
if (originalFilename == null) {
throw new IllegalArgumentException("文件名不能为空");
}
String fileExtension = getFileExtension(originalFilename).toLowerCase();
logger.info("上传文件: {}, 扩展名: {}", originalFilename, fileExtension);
// 1. 需深度解析,提取图片中的文本
if (request.getDeepAnalysis()) {
return handlePdfFile(request, file);
} else {
// 2.
return handleNormalFile(request, file);
}
}
/**
 * Handles a file that requires deep analysis (PDF -> Markdown extraction).
 * 1.1 Calls the remote conversion service, obtaining an asynchronous task id.
 * 2.1 Stores the task (name, taskId, percent=0, dataset info, original request)
 *     in Redis; a scheduled job later uploads the converted Markdown to Dify.
 *
 * @param request upload request; its deepAnalysis flag is reset to false so
 *                the deferred upload follows the normal (non-deep) path
 * @param file    the uploaded file to convert
 * @return 200 response with the conversion task id and file name
 * @throws RuntimeException wrapping any failure during submission or storage
 */
private ResponseEntity<Map> handlePdfFile(DocumentUploadReq request, MultipartFile file) throws JsonProcessingException {
    try {
        // 1.1 Submit the file to the PDF conversion service.
        String taskId = callPdfConversionService(file);
        // Resolve the dataset name so task listings can display it.
        DatasetInfoRequest datasetInfoRequest = new DatasetInfoRequest();
        datasetInfoRequest.setDatasetId(request.getDatasetId());
        DatasetInfoResponse datasetInfo = difyDatasetService.info(datasetInfoRequest);
        String datasetName = datasetInfo.getName();
        // Reset the flag: the deferred upload must take the normal branch.
        request.setDeepAnalysis(false);
        // 2.1 Persist the task for the polling scheduler.
        PdfTaskDto pdfTask = PdfTaskDto.builder()
                .name(file.getOriginalFilename())
                .taskId(taskId)
                .percent(0.0)
                .datasetId(request.getDatasetId())
                .datasetName(datasetName)
                .uploadReq(request)
                .build();
        storePdfTaskToRedis(pdfTask);
        logger.info("PDF转换任务已提交任务ID: {}, 文件名: {}", taskId, file.getOriginalFilename());
        // Respond immediately; the actual upload happens asynchronously.
        Map<String, Object> response = new HashMap<>();
        response.put("success", true);
        response.put("message", "PDF文件已提交转换任务ID: " + taskId);
        response.put("task_id", taskId);
        response.put("file_name", file.getOriginalFilename());
        return ResponseEntity.ok(response);
    } catch (Exception e) {
        logger.error("PDF文件处理失败: {}", e.getMessage(), e);
        throw new RuntimeException("PDF文件处理失败: " + e.getMessage(), e);
    }
}
/**
* 处理非PDF文件执行正常上传逻辑
*/
private ResponseEntity<Map> handleNormalFile(DocumentUploadReq request, MultipartFile file) throws JsonProcessingException {
String url = difyUrl + Constants.DATABASE_API + "/{dataset_id}/document/create-by-file";
HttpHeaders headers = new HttpHeaders();
@ -125,7 +248,6 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
// 创建临时请求对象不包含datasetIddatasetId用于URL路径参数
DocumentUploadReq dataRequest = new DocumentUploadReq();
dataRequest.setIndexingTechnique(request.getIndexingTechnique());
dataRequest.setProcessRule(request.getProcessRule());
//设置检索模式默认混合检索
@ -178,5 +300,92 @@ public class DifyDatasetApiServiceImpl implements DifyDatasetApiService {
}
}
/**
 * Submits a file to the external PDF -> Markdown conversion service
 * (POST {pdfConversionServiceUrl}/v1/pdf2md, multipart/form-data).
 *
 * @param file the PDF to convert
 * @return the asynchronous conversion task id returned by the service
 * @throws RuntimeException if the file cannot be read, the remote call fails,
 *                          or the service returns no task id
 */
private String callPdfConversionService(MultipartFile file) {
    String url = pdfConversionServiceUrl + "/v1/pdf2md";
    HttpHeaders headers = new HttpHeaders();
    headers.setContentType(MediaType.MULTIPART_FORM_DATA);
    MultiValueMap<String, Object> body = new LinkedMultiValueMap<>();
    // Attach the PDF bytes; getFilename() is overridden so the multipart
    // part carries the original file name.
    try {
        body.add("pdf_file", new ByteArrayResource(file.getBytes()) {
            @Override
            public String getFilename() {
                return file.getOriginalFilename();
            }
        });
    } catch (IOException e) {
        throw new RuntimeException("PDF文件读取失败: " + e.getMessage(), e);
    }
    // Optional tuning parameters forwarded to the conversion service.
    body.add("api_url", pdfConversionApiUrl);
    body.add("model_name", pdfConversionModelName);
    body.add("max_workers", pdfConversionMaxWorkers.toString());
    HttpEntity<MultiValueMap<String, Object>> requestEntity = new HttpEntity<>(body, headers);
    ResponseEntity<PdfConversionResponse> response;
    try {
        response = restTemplate.exchange(
                url,
                HttpMethod.POST,
                requestEntity,
                PdfConversionResponse.class
        );
    } catch (Exception e) {
        logger.error("调用PDF转换服务失败: {}", e.getMessage(), e);
        throw new RuntimeException("调用PDF转换服务失败: " + e.getMessage(), e);
    }
    // Validate OUTSIDE the try block: previously this RuntimeException was
    // caught by the surrounding catch(Exception) and wrapped a second time
    // under the misleading "调用PDF转换服务失败" message.
    if (response.getBody() == null || !StringUtils.hasText(response.getBody().getTaskId())) {
        throw new RuntimeException("PDF转换服务返回的任务ID为空");
    }
    return response.getBody().getTaskId();
}
/**
 * Persists a PDF conversion task to Redis.
 * Layout: a hash at "{prefix}:{taskId}" holding the DTO under field
 * "taskInfo" (24h TTL), plus the task id appended to the shared list
 * "{prefix}:list" that the scheduler and the controller scan.
 * Uses Constants.PDF_TASK_REDIS_KEY so writer and readers share one key
 * prefix instead of the class-private duplicate constant.
 * NOTE(review): the list itself has no TTL, so ids can outlive their expired
 * hashes — readers must tolerate missing task details.
 *
 * @param pdfTask the task to store (must carry a non-null taskId)
 * @throws RuntimeException wrapping any Redis failure
 */
private void storePdfTaskToRedis(PdfTaskDto pdfTask) {
    try {
        // Hash keyed by task id, task details under field "taskInfo".
        String hashKey = Constants.PDF_TASK_REDIS_KEY + ":" + pdfTask.getTaskId();
        redisTemplate.opsForHash().put(hashKey, "taskInfo", pdfTask);
        // Details expire after 24 hours.
        redisTemplate.expire(hashKey, 24, TimeUnit.HOURS);
        // Register the id so the scheduled scanner can find the task.
        redisTemplate.opsForList().rightPush(Constants.PDF_TASK_REDIS_KEY + ":list", pdfTask.getTaskId());
        logger.info("PDF任务信息已存储到Redis: {}", pdfTask);
    } catch (Exception e) {
        logger.error("存储PDF任务信息到Redis失败: {}", e.getMessage(), e);
        throw new RuntimeException("存储PDF任务信息到Redis失败: " + e.getMessage(), e);
    }
}
/**
 * Extracts the extension (the text after the last '.') from a file name.
 * Returns "" when the name is null/empty, contains no dot, or ends with a
 * dot. The returned value does not include the leading dot.
 *
 * @param filename file name, may be null
 * @return lower/upper case preserved extension, or "" when absent
 */
private String getFileExtension(String filename) {
    if (filename == null || filename.isEmpty()) {
        return "";
    }
    int dot = filename.lastIndexOf('.');
    // A trailing dot or no dot at all means there is no extension.
    boolean hasExtension = dot != -1 && dot != filename.length() - 1;
    return hasExtension ? filename.substring(dot + 1) : "";
}
}

View File

@ -1,5 +1,6 @@
package com.bjtds.brichat.service.impl;
import cn.hutool.json.JSONUtil;
import com.bjtds.brichat.entity.dataset.DatasetsDocRenameReq;
import com.bjtds.brichat.mapper.postgresql.DifyDatasetsDocMapper;
import com.bjtds.brichat.service.DatasetsDocService;
@ -25,4 +26,11 @@ public class DatasetsDocServiceImpl implements DatasetsDocService {
throw new RuntimeException("文档未找到或更新失败");
}
}
/**
 * Looks up the upload-file id for a document.
 * The document's data-source metadata is stored as a JSON string; the id is
 * read from its "upload_file_id" field.
 *
 * @param id document id
 * @return the upload file id extracted from the data-source JSON
 */
@Override
public String getUploadById(String id) {
    String dataSourceJson = difyDatasetsDocMapper.getDataSourceById(id);
    return JSONUtil.parseObj(dataSourceJson).getStr("upload_file_id");
}
}

View File

@ -10,14 +10,11 @@ import com.bjtds.brichat.entity.dify.DifyUploadFile;
import com.bjtds.brichat.entity.dto.FilePreviewDto;
import com.bjtds.brichat.mapper.postgresql.DifyUploadFileMapper;
import com.bjtds.brichat.service.ChatMessageService;
import com.bjtds.brichat.service.DocumentService;
import com.bjtds.brichat.service.DatasetsDocService;
import com.bjtds.brichat.service.FileService;
import com.bjtds.brichat.util.Constants;
import com.bjtds.brichat.util.FileUploadUtil;
import com.bjtds.brichat.util.PdfUtils;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import io.github.guoshiqiufeng.dify.dataset.DifyDataset;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
@ -36,7 +33,6 @@ import java.io.*;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Service
@ -73,7 +69,7 @@ public class FileServiceImpl implements FileService {
private DifyUploadFileMapper difyUploadFileMapper;
@Resource
private DocumentService documentService;
private DatasetsDocService datasetsDocService;
// @Override
// public String fileUpload(MultipartFile multiFile, String filePath) {
@ -384,7 +380,7 @@ public class FileServiceImpl implements FileService {
String sysMessageId = (String)traceJson.get("messageId");
List<TraceFile> traceFiles = new ArrayList<>();
for( String key : tracePdf.keySet()){
String uploadId = documentService.getUploadById(key);
String uploadId = datasetsDocService.getUploadById(key);
//获取文件上传地址
// /var/www/dify/storage + "/" + upload_files/1f93db68-fb8c-4d58-bff2-3e91d72757cf/9b12ca0d-dd5a-4d4e-9d3f-946227ae4e84.xlsx
DifyUploadFile uploadPath = difyUploadFileMapper.getFileById(uploadId);

View File

@ -261,7 +261,14 @@ public class PdfConversionTaskService {
}
// 2. 创建MultipartFile
MultipartFile multipartFile = createMultipartFileFromPath(path);
MultipartFile multipartFile = createMultipartFileFromPath(path, taskInfo.getName());
//修改文件名
// multipartFile = new SimpleMultipartFile(
// multipartFile.getName(),
// taskInfo.getName() + ".md",
// multipartFile.getContentType(),
// multipartFile.getBytes()
// );
// 3. 调用上传接口
@ -278,9 +285,8 @@ public class PdfConversionTaskService {
/**
* 从文件路径创建MultipartFile (JDK 1.8兼容版本)
*/
private MultipartFile createMultipartFileFromPath(Path filePath) throws IOException {
private MultipartFile createMultipartFileFromPath(Path filePath, String fileName) throws IOException {
File file = filePath.toFile();
String fileName = file.getName();
String originalFileName = fileName.endsWith(".md") ? fileName : fileName + ".md";
// JDK 1.8兼容的文件读取方式

View File

@ -16,4 +16,7 @@ public class Constants {
public static final String BEARER = "Bearer ";
public static final String DATABASE_API = "/v1/datasets";
public static final String PDF_TASK_REDIS_KEY = "pdf:conversion:tasks";
}

View File

@ -48,4 +48,13 @@ dify:
email: bjtds@bjtds.com # 请替换为实际的 Dify 服务邮箱,若不需要调用 server相关接口可不填
password: 123456Aa # 请替换为实际的 Dify 服务密码,若不需要调用 server相关接口可不填
dataset:
api-key: ${dify-dataset-api-key:dataset-zVa4uJBUem96P19o8iBtyihQ} # 请替换为实际的知识库api-key, 若不需要调用知识库可不填
api-key: ${dify-dataset-api-key:dataset-zVa4uJBUem96P19o8iBtyihQ} # 请替换为实际的知识库api-key, 若不需要调用知识库可不填
# PDF转换服务配置
pdf:
conversion:
service:
url: ${pdf-conversion-url:http://192.168.1.211:12201} # PDF转换服务地址
api-url: ${pdf-conversion-api-url:http://192.168.1.211:1050/v1/chat/completions} # QwenVL API服务地址
model-name: ${pdf-conversion-model:qwenvl} # 使用的模型名称
max-workers: ${pdf-conversion-max-workers:10} # 并发线程数

View File

@ -48,4 +48,13 @@ dify:
email: bjtds@bjtds.com # 请替换为实际的 Dify 服务邮箱,若不需要调用 server相关接口可不填
password: 123456Aa # 请替换为实际的 Dify 服务密码,若不需要调用 server相关接口可不填
dataset:
api-key: ${dify-dataset-api-key:dataset-0Hij9IwoWYbJe1vvwVh8y7DS} # 请替换为实际的知识库api-key, 若不需要调用知识库可不填
api-key: ${dify-dataset-api-key:dataset-0Hij9IwoWYbJe1vvwVh8y7DS} # 请替换为实际的知识库api-key, 若不需要调用知识库可不填
# PDF转换服务配置
pdf:
conversion:
service:
url: ${pdf-conversion-url:http://192.168.8.253:12201} # PDF转换服务地址
api-url: ${pdf-conversion-api-url:http://192.168.8.253:1050/v1/chat/completions} # QwenVL API服务地址
model-name: ${pdf-conversion-model:qwenvl} # 使用的模型名称
max-workers: ${pdf-conversion-max-workers:10} # 并发线程数

View File

@ -18,13 +18,13 @@ spring:
# 可选值: wuhan, beijing
# 线上部署时可通过环境变量 SPRING_PROFILES_ACTIVE 覆盖
profiles:
active: beijing
active: wuhan
# 文件上传配置
servlet:
multipart:
max-request-size: 10MB
max-file-size: 10MB
max-request-size: 100MB
max-file-size: 100MB
# 数据源配置已移至对应的环境配置文件
# application-wuhan.yml 和 application-beijing.yml