forked from 1Panel-dev/MaxKB
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8fe1a14
commit c89ae29
Showing
20 changed files
with
581 additions
and
141 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
# coding=utf-8 | ||
""" | ||
@project: maxkb | ||
@Author:虎 | ||
@file: ts_vecto_util.py | ||
@date:2024/4/16 15:26 | ||
@desc: | ||
""" | ||
import re | ||
import uuid | ||
from typing import List | ||
|
||
import jieba | ||
from jieba import analyse | ||
|
||
from common.util.split_model import group_by | ||
|
||
# Pool of single-character placeholder keys: code points 38..83 ('&' .. 'S').
jieba_word_list_cache = list(map(chr, range(38, 84)))

# Register every '#<key>#' placeholder in jieba's dictionary
# (presumably so the tokenizer keeps each placeholder as one token).
for jieba_word in jieba_word_list_cache:
    jieba.add_word(f"#{jieba_word}#")
# Patterns whose matches must be kept whole instead of being segmented.
# Disabled candidates kept for reference:
#   URLs:           r"(?i)\b(?:https?|ftp|tcp|file)://[^\s]+\b"
#   quoted strings: r'"([^"]*)"'
word_pattern_list = [
    # Version numbers such as v1.2.3. The dots are escaped so they only match
    # a literal '.', not any character.
    r"v\d+\.\d+\.\d+",
    # E-mail addresses. The top-level-domain class is letters only; a '|'
    # inside a character class is a literal pipe, not an alternation.
    r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
]

# Tokens that occur as a substring of this string are dropped from the
# generated search vector / query.
remove_chars = '\n , :\'<>!@#¥%……&*()!@#$%^&*(): ;,/"./-'
|
||
|
||
def get_word_list(text: str):
    """
    Collect every substring of ``text`` matched by ``word_pattern_list``.

    A match that contains ':' is split on ':' and its pieces are collected
    individually.

    :param text: text to scan
    :return: list of words, ordered by pattern and then by match position
    """
    protected_words = []
    for regex in word_pattern_list:
        for match in re.findall(regex, text):
            # findall yields tuples for multi-group patterns, strings otherwise
            pieces = match if isinstance(match, tuple) else (match,)
            for piece in pieces:
                # A word must not contain ':', so split on it. A piece without
                # ':' comes back from split() as a one-item list.
                protected_words.extend(piece.split(":"))
    return protected_words
|
||
|
||
def replace_word(word_dict, text: str):
    """
    Replace each protected word in ``text`` with its placeholder key.

    :param word_dict: mapping of placeholder key -> original word
    :param text: text to rewrite
    :return: ``text`` with every occurrence of each word that is not directly
             preceded or followed by '#' replaced by the word's key
    """
    for key, word in word_dict.items():
        if not word:
            # An empty pattern would match at every position of the text.
            continue
        # The word is literal text taken from the input, so regex
        # metacharacters in it ('.', '+', ...) must be escaped.
        pattern = '(?<!#)' + re.escape(word) + '(?!#)'
        # A callable replacement inserts the key verbatim, without backslash
        # or group-reference processing.
        text = re.sub(pattern, lambda _match, placeholder=key: placeholder, text)
    return text
|
||
|
||
def get_word_key(text: str, use_word_list):
    """
    Pick a placeholder key that does not occur in ``text`` and is not taken.

    :param text: text the key will be substituted into
    :param use_word_list: keys already in use; both bare keys ('A') and
                          wrapped keys ('#A#') are recognised
    :return: a bare key (the caller wraps it in '#')
    """
    for j_word in jieba_word_list_cache:
        if j_word in text:
            continue
        # Keys are stored wrapped as '#<key>#' by the word dict, so check both
        # spellings. Checking only the bare key would never see a wrapped key
        # as taken and would hand out the same key for every word.
        if j_word in use_word_list or '#' + j_word + '#' in use_word_list:
            continue
        return j_word
    # Pool exhausted: fall back to a unique key. Register it in the wrapped
    # form, as is done for the pooled keys, because the wrapped form is what
    # gets substituted into the text.
    j_word = str(uuid.uuid1())
    jieba.add_word('#' + j_word + '#')
    return j_word
|
||
|
||
def to_word_dict(word_list: List, text: str):
    """
    Assign a distinct '#<key>#' placeholder to every protected word.

    :param word_list: words that must not be segmented
    :param text: text the placeholders will be substituted into
    :return: dict mapping '#<key>#' -> word
    """
    word_dict = {}
    # Track the bare keys handed out so far. get_word_key compares bare keys,
    # so passing the wrapped dict keys would never mark a key as taken and
    # every word would receive (and overwrite) the same placeholder.
    used_keys = set()
    # dict.fromkeys drops repeated words while keeping first-seen order; one
    # placeholder per distinct word is enough because replacement is global.
    for word in dict.fromkeys(word_list):
        key = get_word_key(text, used_keys)
        used_keys.add(key)
        word_dict['#' + key + '#'] = word
    return word_dict
|
||
|
||
def get_key_by_word_dict(key, word_dict):
    """
    Translate a token back to the word it stands for.

    :param key: token to look up
    :param word_dict: mapping of placeholder -> original word
    :return: the mapped word, or ``key`` itself when there is no mapping
             (or the mapped value is None)
    """
    mapped = word_dict.get(key)
    return key if mapped is None else mapped
|
||
|
||
def to_ts_vector(text: str):
    """
    Segment ``text`` and render it as space-separated ``word:pos,pos,...``
    entries (presumably the text form of a PostgreSQL tsvector).

    Words matched by ``word_pattern_list`` are swapped for placeholders before
    segmentation and mapped back afterwards so they stay in one piece.

    :param text: text to index
    :return: one ``word:positions`` entry per distinct token; words are
             lower-cased, positions are 1-based start offsets, and at most 20
             positions are kept per word
    """
    # Collect the words that must not be segmented
    word_list = get_word_list(text)
    # Assign a placeholder to each of them
    word_dict = to_word_dict(word_list, text)
    # Substitute the placeholders into the text
    text = replace_word(word_dict, text)
    # Segment; item[0] is the token and item[1] its start offset
    tokens = jieba.tokenize(text, mode='search')
    result_ = [{'word': get_key_by_word_dict(item[0], word_dict), 'index': item[1]} for item in tokens]
    result_group = group_by(result_, lambda r: r['word'])
    return " ".join(
        f"{key.lower()}:{','.join([str(item['index'] + 1) for item in result_group[key]][:20])}"
        for key in result_group
        # Drop punctuation and whitespace-only tokens. The previous check,
        # len(key.strip()) >= 0, was always true and let tokens such as a tab
        # or a run of spaces through.
        if key not in remove_chars and key.strip())
|
||
|
||
def to_query(text: str):
    """
    Extract up to five keywords from ``text`` and join them with spaces.

    Words matched by ``word_pattern_list`` are swapped for placeholders before
    keyword extraction and mapped back afterwards.

    :param text: query text
    :return: space-separated keywords
    """
    # Collect the words that must not be segmented
    word_list = get_word_list(text)
    # Assign a placeholder to each of them
    word_dict = to_word_dict(word_list, text)
    # Substitute the placeholders into the text
    text = replace_word(word_dict, text)
    tags = analyse.extract_tags(text, topK=5, withWeight=True, allowPOS=('ns', 'n', 'vn', 'v', 'eng'))
    keywords = []
    for word, _score in tags:
        if word in remove_chars:
            continue
        keywords.append(get_key_by_word_dict(word, word_dict))
    # Remove the protected words from jieba's dictionary
    for word in word_list:
        jieba.del_word(word)
    return " ".join(keywords)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Generated by Django 4.1.13 on 2024-04-16 11:43 | ||
|
||
import django.contrib.postgres.search | ||
from django.db import migrations | ||
|
||
from common.util.common import sub_array | ||
from common.util.ts_vecto_util import to_ts_vector | ||
from dataset.models import Status | ||
from embedding.models import Embedding | ||
|
||
|
||
def update_embedding_search_vector(embedding, paragraph_list):
    """
    Build an ``Embedding`` carrying the search vector for one embedding row.

    :param embedding: mapping read via ``.get`` for the keys 'id' and 'paragraph'
    :param paragraph_list: iterable of paragraphs exposing ``id``, ``title``
                           and ``content``
    :return: ``Embedding(id=..., search_vector=...)``; the vector is built from
             title + content of the first paragraph whose id equals the row's
             'paragraph' value, or is the empty string when none matches
    """
    embedding_id = embedding.get('id')
    paragraph_id = embedding.get('paragraph')
    matches = [p for p in paragraph_list if p.id == paragraph_id]
    if not matches:
        return Embedding(id=embedding_id, search_vector="")
    first = matches[0]
    return Embedding(id=embedding_id, search_vector=to_ts_vector(first.title + first.content))
|
||
|
||
def save_keywords(apps, schema_editor):
    """
    Data migration step: compute ``search_vector`` for the existing embeddings,
    one document at a time.

    :param apps: migration app registry used to look up the models
    :param schema_editor: schema editor; its connection alias selects the database
    """
    document_model = apps.get_model("dataset", "Document")
    embedding_model = apps.get_model("embedding", "Embedding")
    paragraph_model = apps.get_model('dataset', 'Paragraph')
    db_alias = schema_editor.connection.alias
    for doc in document_model.objects.using(db_alias).all():
        doc.status = Status.embedding
        doc.save()
        paragraph_list = paragraph_model.objects.using(db_alias).filter(document=doc).all()
        embedding_rows = embedding_model.objects.using(db_alias).filter(document=doc).values(
            'id', 'search_vector', 'paragraph')
        embedding_update_list = [update_embedding_search_vector(row, paragraph_list) for row in embedding_rows]
        # Write back in groups produced by sub_array(..., 20)
        for batch in sub_array(embedding_update_list, 20):
            try:
                embedding_model.objects.using(db_alias).bulk_update(batch, ['search_vector'])
            except Exception as e:
                # Best effort: report the failure and continue with the next batch
                print(e)
|
||
|
||
class Migration(migrations.Migration):
    """Add the ``search_vector`` field to ``embedding`` and backfill it."""

    dependencies = [
        ('embedding', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='embedding',
            name='search_vector',
            field=django.contrib.postgres.search.SearchVectorField(default='', verbose_name='分词'),
        ),
        # Without reverse_code a RunPython operation is irreversible and blocks
        # unapplying this migration. Nothing needs undoing for the backfill:
        # reversing the AddField above removes the field and its data.
        migrations.RunPython(save_keywords, reverse_code=migrations.RunPython.noop),
    ]
Oops, something went wrong.