-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathknowledge_base.py
More file actions
111 lines (92 loc) · 3.42 KB
/
knowledge_base.py
File metadata and controls
111 lines (92 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
'''
Core code of the ingestion pipeline: deduplicate -> split -> attach metadata -> store.
'''
import logging
import logger_config
import config_data as config
import os
import hashlib
from datetime import datetime
from langchain_chroma import Chroma
from langchain_community.embeddings import DashScopeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
logger = logging.getLogger(__name__)
def check_md5(md5_str: str) -> bool:
    """Report whether *md5_str* is already recorded in the MD5 ledger file.

    The ledger is the text file at ``config.md5_path`` holding one digest per
    line. Each line is stripped of surrounding whitespace before comparison.

    Args:
        md5_str: Hex digest to look up.

    Returns:
        True if a line of the ledger equals *md5_str*; False otherwise,
        including when the ledger did not exist yet (it is then created
        empty, as before).

    Raises:
        OSError: If the ledger can be neither read nor created (for example
            because its parent directory is missing).
    """
    try:
        # EAFP: open directly instead of exists()-then-open, so there is no
        # window between the check and the read.
        with open(config.md5_path, 'r', encoding='utf-8') as ledger:
            return any(line.strip() == md5_str for line in ledger)
    except FileNotFoundError:
        # First use: create the empty ledger. Append mode is deliberate: if
        # another writer created the file in the meantime, 'w' would truncate
        # it and drop the digests it already holds; 'a' never truncates.
        with open(config.md5_path, 'a', encoding='utf-8'):
            pass
        return False
def save_md5(md5_str: str):
    """Append *md5_str* as one new line to the file at ``config.md5_path``.

    The file is opened in append mode with UTF-8 encoding, so earlier
    entries are kept and the file is created if it does not exist.
    """
    with open(config.md5_path, mode='a', encoding='utf-8') as ledger:
        # One digest per line; the newline terminator is added here.
        record = ''.join((md5_str, '\n'))
        ledger.write(record)
def get_string_md5(input_str: str, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of *input_str*.

    The text is first encoded to bytes with *encoding* (UTF-8 by default),
    then hashed; the digest is returned as a lowercase hex string.
    """
    payload = input_str.encode(encoding=encoding)
    return hashlib.md5(payload).hexdigest()
class KnowledgeBaseService:
    """Ingest raw text into a Chroma vector store, skipping repeated content.

    The instance owns two collaborators, both configured from ``config``:
    ``self.chroma`` (the vector store) and ``self.splitter`` (the text
    splitter used for long documents).
    """

    def __init__(self):
        """Create the storage directory, the vector store and the splitter."""
        # Make sure the persistence directory exists before the store uses it.
        os.makedirs(config.persist_directory, exist_ok=True)
        # Vector store.
        self.chroma = Chroma(
            collection_name=config.collection_name,
            persist_directory=config.persist_directory,
            embedding_function=DashScopeEmbeddings(model=config.embedding_model_name),
        )
        # Text splitter.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            separators=config.separators,
            length_function=len,
        )

    def upload_by_str(self, data, filename, operator='rabbit'):
        """Deduplicate, split and store the text of one document.

        Args:
            data: Full text of the document.
            filename: Recorded as the ``source`` metadata field and used in
                log messages.
            operator: Recorded as the ``operator`` metadata field. Defaults
                to ``'rabbit'``, the value that used to be hard-coded.

        Returns:
            A status string. It starts with ``[跳过]`` when the text is blank
            or its MD5 is already recorded, equals ``[入库成功]`` on success,
            and starts with ``[失败]`` (followed by the error text) when any
            step raised. Exceptions are logged and not propagated.
        """
        logger.info(f'开始处理文件: {filename}')
        try:
            if not data or not data.strip():
                logger.warning(f'文件内容为空,跳过入库: {filename}')
                return '[跳过]文件内容为空'

            # 1. Fingerprint the content.
            md5_hex = get_string_md5(data)

            # 2. Skip content that has already been ingested.
            if check_md5(md5_hex):
                logger.warning(f'文件已存在,跳过入库: {filename}')
                return '[跳过]该文件内容已处理过'

            # 3. Split only texts longer than the configured threshold.
            if len(data) > config.max_split_char_number:
                knowledge_chunks = self.splitter.split_text(data)
            else:
                knowledge_chunks = [data]
            logger.info(f'文本切片数量: {len(knowledge_chunks)}')

            # 4. Metadata attached to every chunk.
            metadata = {
                'source': filename,
                'create_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'operator': operator,
            }

            # 5. Store the chunks.
            #    - Each chunk gets its own dict so that no mutable object is
            #      shared between records.
            #    - Ids are derived from the content hash and chunk position,
            #      so if step 6 fails and the upload is retried, the same
            #      records are written again under the same ids rather than
            #      under new random ones.
            self.chroma.add_texts(
                knowledge_chunks,
                metadatas=[dict(metadata) for _ in knowledge_chunks],
                ids=[f'{md5_hex}_{index}' for index in range(len(knowledge_chunks))],
            )

            # 6. Record the fingerprint only after the store call succeeded.
            save_md5(md5_hex)
            logger.info(f'文件入库成功: {filename}')

            # 7. Report success.
            return '[入库成功]'
        except Exception as e:
            # Boundary handler: callers receive a status string, and the
            # traceback goes to the log.
            logger.exception(f'文件入库失败: {filename}')
            return f'[失败]入库出错: {e}'