AI智能客服-牧云

1. 表设计

1.1. 问答文件表

-- Storage table for the customer-service Markdown Q&A files
CREATE TABLE t_ai_customer_service_md_storage (
    id BIGSERIAL PRIMARY KEY,
    original_file_name VARCHAR(160) NOT NULL,
    -- The stored name is "<36-char UUID>-<original name>", i.e. up to 37 characters
    -- longer than original_file_name, so this column must be wider than that one.
    new_file_name VARCHAR(200) NOT NULL,
    file_path VARCHAR(500) NOT NULL,
    file_size BIGINT NOT NULL,
    -- Columns with a default are declared NOT NULL so an explicit NULL cannot slip in.
    status SMALLINT NOT NULL DEFAULT 0,
    remark VARCHAR(200),
    create_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    update_time TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
);

-- Table comment
COMMENT ON TABLE t_ai_customer_service_md_storage IS '问答 Markdown 文件存储表';

-- Column comments
COMMENT ON COLUMN t_ai_customer_service_md_storage.id IS '主键ID';
COMMENT ON COLUMN t_ai_customer_service_md_storage.original_file_name IS '原始文件名称';
COMMENT ON COLUMN t_ai_customer_service_md_storage.new_file_name IS '新命名文件名称（防止名称相同导致覆盖）';
COMMENT ON COLUMN t_ai_customer_service_md_storage.file_path IS '文件存储路径';
COMMENT ON COLUMN t_ai_customer_service_md_storage.file_size IS '文件大小(字节)';
COMMENT ON COLUMN t_ai_customer_service_md_storage.status IS '处理状态：0-待处理 1-向量化中 2-已完成 3-失败';
COMMENT ON COLUMN t_ai_customer_service_md_storage.remark IS '备注信息';
COMMENT ON COLUMN t_ai_customer_service_md_storage.create_time IS '创建时间';
COMMENT ON COLUMN t_ai_customer_service_md_storage.update_time IS '更新时间';

-- Indexes on the columns used as search conditions; each index is named after its column
CREATE INDEX idx_t_ai_customer_service_md_storage_status ON t_ai_customer_service_md_storage(status);
CREATE INDEX idx_t_ai_customer_service_md_storage_create_time ON t_ai_customer_service_md_storage(create_time);
CREATE INDEX idx_t_ai_customer_service_md_storage_original_file_name ON t_ai_customer_service_md_storage(original_file_name);

1.2. 向量表

问答文件上传完毕后，需要对文件进行向量化存储

-- Enable the extensions this script relies on
-- (uuid-ossp supplies the uuid_generate_v4() default used below)
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS hstore;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

CREATE TABLE IF NOT EXISTS t_vector_store (
	id uuid DEFAULT uuid_generate_v4() PRIMARY KEY,
	content text,
	metadata json,
	-- The dimension must equal the output dimension of the embedding model in use
	embedding vector(1536)
);

-- HNSW index for cosine-distance search.
-- The index is named and guarded with IF NOT EXISTS so that re-running this script
-- (every other statement here is already idempotent) does not build a second,
-- duplicate index. The name matches what PostgreSQL would normally auto-generate
-- for an unnamed index on this column, so databases that already ran the unnamed
-- form are expected to skip it — confirm with \d t_vector_store.
CREATE INDEX IF NOT EXISTS t_vector_store_embedding_idx
	ON t_vector_store USING HNSW (embedding vector_cosine_ops);

要想 PostgreSQL 支持向量化存储，需要先启用 vector、hstore 和 uuid-ossp 扩展；
t_vector_store 向量化表, 相关表字段作用如下：
id : 主键，使用 uuid 类型；若插入时不指定，则由 uuid_generate_v4() 函数（由 uuid-ossp 扩展提供）自动生成一个随机的 UUID。
content: 存储原始的文本内容。
metadata : 存储附加信息，如文件名、作者、类型等等；
embedding: 向量数据，1536 是向量维度，需要与向量模型指定的维度保持一致，若使用的是生成 384 或 768 维的向量模型，必须修改这个数字。
USING HNSW: 指定使用 HNSW 索引算法。HNSW 是一种为高维向量相似性搜索设计的先进算法。它像一张高速公路网，能让你在庞大的向量空间中快速找到近似最近邻，而无需遍历每一个向量。
(embedding vector_cosine_ops): 指定对哪个字段和用什么方法进行索引。
embedding：要在此字段上创建索引的向量列。
vector_cosine_ops：表示这个索引是为余弦相似度计算优化的。
余弦相似度 是衡量向量方向相似性的指标，在文本语义搜索中非常有效，因为它能捕捉到含义上的相似性，而不受向量大小（magnitude）的影响。

2. Markdown 问答文件上传接口开发

涉及到文件上传，此接口使用 “表单方式” 提交，即 content-type 为 multipart/form-data:

POST /customer-service/md/upload

import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.time.LocalDateTime;

/**
 * Persistence object for one stored AI customer-service Markdown Q&A file;
 * mapped to the table t_ai_customer_service_md_storage.
 **/
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
@TableName("t_ai_customer_service_md_storage")
public class AiCustomerServiceMdStorageDO {

    // Primary key; IdType.AUTO leaves id generation to the database
    @TableId(type = IdType.AUTO)
    private Long id;
    // File name as submitted by the uploader
    private String originalFileName;
    // Renamed file name used on disk, so that equal names do not overwrite each other
    private String newFileName;
    // Path the file is stored at
    private String filePath;
    // File size in bytes
    private Long fileSize;
    // Processing status: 0 = pending, 1 = vectorizing, 2 = completed, 3 = failed
    private Integer status;
    // Free-form remark
    private String remark;
    // Creation time
    private LocalDateTime createTime;
    // Last update time
    private LocalDateTime updateTime;
}

<!-- Apache Commons IO: supplies the FilenameUtils helper that the upload service uses
     to read a file's extension. The version is taken from the commons-io.version
     property, which is defined outside this snippet. -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>${commons-io.version}</version>
</dependency>

/**
 * Customer-service operations; at present this covers uploading Markdown Q&A files.
 */
@Service
@Slf4j
public class CustomerServiceImpl implements CustomerService {

    /** Directory uploaded Markdown files are written to (property customer-service.md-storage-path). */
    @Value("${customer-service.md-storage-path}")
    private String mdStoragePath;

    @Resource
    private AiCustomerServiceMdStorageMapper aiCustomerServiceMdStorageMapper;

    /**
     * Saves an uploaded Markdown Q&A file into the storage directory and inserts a
     * matching database record with status PENDING.
     *
     * @param file the uploaded multipart file; may be null
     * @return a success response once the file is written and the record inserted
     * @throws BizException with UPLOAD_FILE_CANT_EMPTY when the file is missing or empty,
     *                      ONLY_SUPPORT_MARKDOWN when the file name has no "md" extension,
     *                      UPLOAD_FILE_FAILED when writing the file raises an IOException
     */
    @Override
    public Response<?> uploadMarkdownFile(MultipartFile file) {
        // Reject a missing or empty upload
        if (file == null || file.isEmpty()) {
            throw new BizException(ResponseCodeEnum.UPLOAD_FILE_CANT_EMPTY);
        }

        // The submitted file name is controlled by the client. Keep only its last path
        // segment so that names such as "../../x.md" or "C:\dir\x.md" cannot steer the
        // write outside the storage directory, then trim surrounding whitespace.
        String originalFilename = StringUtils.trimToEmpty(FilenameUtils.getName(file.getOriginalFilename()));

        // Only Markdown is accepted (a blank name is rejected by the same check)
        if (!isMarkdownFile(originalFilename)) {
            throw new BizException(ResponseCodeEnum.ONLY_SUPPORT_MARKDOWN);
        }

        try {
            // Prefix a random UUID so that two uploads with the same name never overwrite each other
            String newFilename = UUID.randomUUID().toString() + "-" + originalFilename;

            // Work with an absolute directory: transferTo(File) may resolve a relative
            // destination against the servlet container's upload location instead of
            // the application's working directory.
            Path storageDirectory = Paths.get(mdStoragePath).toAbsolutePath().normalize();
            Path targetPath = storageDirectory.resolve(newFilename);

            // Make sure the directory exists before writing into it
            Files.createDirectories(storageDirectory);

            // Write the upload to disk
            file.transferTo(targetPath.toFile());

            log.info("## Markdown 问答文件存储成功, 文件名：{} -> 存储路径：{}", originalFilename, targetPath);

            // Use one timestamp so createTime and updateTime are identical on insert
            LocalDateTime now = LocalDateTime.now();

            // Persist the file record
            aiCustomerServiceMdStorageMapper.insert(AiCustomerServiceMdStorageDO.builder()
                            .originalFileName(originalFilename)
                            .newFileName(newFilename)
                            .filePath(targetPath.toString())
                            .fileSize(file.getSize())
                            .status(AiCustomerServiceMdStatusEnum.PENDING.getCode())
                            .createTime(now)
                            .updateTime(now)
                            .build());

            return Response.success();

        } catch (IOException e) {
            log.error("## Markdown 问答文件上传失败：{}", originalFilename, e);
            throw new BizException(ResponseCodeEnum.UPLOAD_FILE_FAILED);
        }
    }

    /**
     * Tells whether a file name carries the Markdown extension.
     *
     * @param filename the file name to inspect
     * @return true when the extension equals "md" ignoring case; false for a blank name
     */
    private boolean isMarkdownFile(String filename) {
        if (StringUtils.isBlank(filename)) {
            return false;
        }

        // Compare the extension case-insensitively
        String extension = FilenameUtils.getExtension(filename);
        return StringUtils.equalsIgnoreCase(extension, "md");
    }
}

/**
 * REST endpoints of the AI customer service, mounted under /customer-service.
 */
@Slf4j
@RestController
@RequestMapping("/customer-service")
public class AiCustomerServiceController {

    @Resource
    private CustomerService customerService;

    /**
     * Accepts a Markdown Q&A file submitted as multipart form data and passes it to the service.
     *
     * @param file the "file" form part; the part is declared optional, so this may be null
     * @return the response produced by the service for this upload
     */
    @PostMapping(value = "/md/upload", consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
    public Response<?> uploadMarkdownFile(@RequestPart(value = "file", required = false) MultipartFile file) {
        Response<?> uploadResult = customerService.uploadMarkdownFile(file);
        return uploadResult;
    }
}

3. Spring Event

/**
 * Handles the event raised for an uploaded customer-service Markdown file.
 *
 * @Version: v1.0.0
 **/
@Slf4j
@Component
public class AiCustomerServiceMdUploadedListener {

    /**
     * Entry point for vectorizing an uploaded Markdown file. For now it only logs
     * the event; the vectorization itself is still to be written.
     *
     * @param event the upload event being handled
     */
    @Async("eventTaskExecutor") // run on our custom thread pool
    @EventListener
    public void vectorizing(AiCustomerServiceMdUploadedEvent event) {
        log.info("## AiCustomerServiceMdUploadedEvent: {}", event);

        // TODO: vectorize the Markdown file
    }
}

4. 分片上传、断点续传、秒传实现思路与表设计

4.1. 分片上传

前端将大文件（如 10G）进行分片处理，切割为若干个小文件，比如每片 10M；
前端将这些分片小文件独立并行的上传，互不干扰；
服务端分别接受这些小文件，等所有分片小文件均上传成功后，再进行分片合并处理，还原为原始的大文件；

4.2. 断点续传

分片上传过程中，服务端会存储 “已上传的分片进度”；
如果中断了，再次上传相同文件时，前端会询问服务端：上次我上传到哪儿了？
服务端返回上次的进度，比如 30%；
前端：好嘞，那我从 30% 继续；

4.3. 秒传

为每个上传文件赋予一个 “身份标识”，如计算文件 MD5 值；
前端上传文件时，会询问服务端：当前这个文件，服务端已经有了没？
服务端根据 MD5 值查询，判断文件是否已被上传，若已存在，直接做个 “快捷方式” 引用；
前端拿到响应，知道被上传的文件已存在，不再进行 “分片处理”，直接提示 “上传成功”；

4.4. 表设计

4.4.1. 文件元数据表

-- File metadata table used for chunked upload, resumable upload and instant upload
create table t_ai_customer_service_file_storage
(
    id              bigserial
        primary key,
    file_name       varchar(160)                        not null,
    file_md5        varchar(32)                         not null,
    file_path       varchar(500)                        not null,
    file_size       bigint                              not null,
    total_chunks    integer                             not null,
    uploaded_chunks integer   default 0                 not null,
    -- has a default, so it is declared not null like the other defaulted columns
    status          smallint  default 0                 not null,
    remark          varchar(200),
    create_time     timestamp default CURRENT_TIMESTAMP not null,
    update_time     timestamp default CURRENT_TIMESTAMP not null
);

-- Table and column comments
comment on table t_ai_customer_service_file_storage is 'AI客服文件存储表';

comment on column t_ai_customer_service_file_storage.id is '主键ID';

comment on column t_ai_customer_service_file_storage.file_name is '原始文件名';

comment on column t_ai_customer_service_file_storage.file_md5 is '文件MD5值，用于秒传和去重';

comment on column t_ai_customer_service_file_storage.file_path is '文件存储路径';

comment on column t_ai_customer_service_file_storage.file_size is '文件大小（字节）';

comment on column t_ai_customer_service_file_storage.total_chunks is '总分片数';

comment on column t_ai_customer_service_file_storage.uploaded_chunks is '已上传分片数';

comment on column t_ai_customer_service_file_storage.status is '处理状态：0-上传中 1-待处理 2-向量化中 3-已完成 4-失败';

comment on column t_ai_customer_service_file_storage.remark is '备注';

comment on column t_ai_customer_service_file_storage.create_time is '创建时间';

comment on column t_ai_customer_service_file_storage.update_time is '更新时间';


-- Index names are unique per schema in PostgreSQL, so every index below carries the
-- table name as a prefix; generic names such as idx_status would clash with an
-- index of the same name on any other table.

-- Unique index on the file MD5 value
create unique index uk_t_ai_customer_service_file_storage_file_md5
    on t_ai_customer_service_file_storage (file_md5);

comment on index uk_t_ai_customer_service_file_storage_file_md5 is '文件MD5唯一索引，用于秒传去重';

-- Indexes on the other columns likely to appear in search conditions
create index idx_t_ai_customer_service_file_storage_status
    on t_ai_customer_service_file_storage (status);

comment on index idx_t_ai_customer_service_file_storage_status is '状态索引，用于快速查询不同状态的文件';

create index idx_t_ai_customer_service_file_storage_create_time
    on t_ai_customer_service_file_storage (create_time);

comment on index idx_t_ai_customer_service_file_storage_create_time is '创建时间索引，用于按时间范围查询';

create index idx_t_ai_customer_service_file_storage_file_name
    on t_ai_customer_service_file_storage (file_name);

comment on index idx_t_ai_customer_service_file_storage_file_name is '文件名索引，支持文件名搜索';

4.4.2. 文件分片表

-- Chunk table: one row per uploaded chunk of a file
create table t_file_chunk_info
(
    id           bigserial
        primary key,
    file_md5     varchar(32)                         not null,
    chunk_number integer                             not null,
    chunk_path   varchar(500)                        not null,
    chunk_size   bigint                              not null,
    create_time  timestamp default CURRENT_TIMESTAMP not null
);

-- Table and column comments
comment on table t_file_chunk_info is '分片信息表';

comment on column t_file_chunk_info.id is '主键ID';

comment on column t_file_chunk_info.file_md5 is '文件MD5值';

comment on column t_file_chunk_info.chunk_number is '分片序号（从0开始）';

comment on column t_file_chunk_info.chunk_path is '分片文件存储路径';

comment on column t_file_chunk_info.chunk_size is '分片大小（字节）';

comment on column t_file_chunk_info.create_time is '创建时间';

-- A chunk is identified by (file_md5, chunk_number). Making that pair unique stops a
-- retried or parallel upload of the same chunk from inserting a second row, which
-- would distort the count of uploaded chunks. Because file_md5 is the leading column,
-- this index also serves lookups of all chunks of one file, so no separate
-- single-column index on file_md5 is needed. The name carries the table prefix
-- because index names are unique per schema in PostgreSQL.
create unique index uk_t_file_chunk_info_file_md5_chunk_number
    on t_file_chunk_info (file_md5, chunk_number);

comment on index uk_t_file_chunk_info_file_md5_chunk_number is '文件MD5与分片序号唯一索引，防止分片重复入库，同时用于查询某文件的所有分片';

id : 主键 ID；
file_md5 : 原始文件的 MD5 值，用于关联 t_ai_customer_service_file_storage 表；
chunk_number : 分片序号，所有分片小文件，都有其对应序号，从 0 开始排序；
chunk_path : 分片文件存储路径；
chunk_size: 分片文件大小，单位字节；
create_time : 创建时间；

4.5. 接口开发

文件检查（秒传）接口：前端在对大文件进行分片切割之前，需要计算该文件的 MD5 值，然后上传给后端。此接口拿到文件 MD5 值后，查询数据库，判断当前文件之前是否已上传过：

若无记录，前端正常执行后续的分片上传操作；
若有记录:

后端记录的总分片数与已上传分片数一致，则无需再次上传，秒传成功；
已上传分片数小于总分片数，说明上传中断了，需要断点续传；

文件分片上传接口：前端分片处理完成后，并行上传这些小文件到服务器。
分片文件合并接口：所有分片小文件均上传完毕后，调用此接口，通知服务端进行合并操作。

目录CONTENT

AI智能客服

1. 表设计

1.1. 问答文件表

1.2. 向量表

2. Markdown 问答文件上传接口开发

3. Spring Event

4. 分片上传、断点续传、秒传实现思路与表设计

4.1. 分片上传

4.2. 断点续传

4.3. 秒传

4.4. 表设计

4.4.1. 文件元数据表

4.4.2. 文件分片表

4.5. 接口开发

评论区