# isekai-toolkit/service/embedding_search.py

from __future__ import annotations
from typing import Optional, TypedDict

import sqlalchemy
from api.model.embedding_search.title_collection import (
    TitleCollectionHelper,
    TitleCollectionModel,
)
from api.model.embedding_search.title_index import TitleIndexHelper, TitleIndexModel
from api.model.embedding_search.page_index import PageIndexHelper
from service.database import DatabaseService
from service.mediawiki_api import MediaWikiApi
from service.openai_api import OpenAIApi
from service.tiktoken import TikTokenService
from utils.wiki import getWikiSentences


class EmbeddingRunningException(Exception):
    """Raised when the page to update is already listed as being indexed."""


class EmbeddingSearchArgs(TypedDict):
    """Typed dictionary of search options.

    The key names match the optional keyword parameters of
    ``EmbeddingSearchService.search`` (presumably so the dict can be
    forwarded to it — verify against callers).
    """

    # NOTE(review): a TypedDict is total by default, so type checkers treat
    # all three keys as required even though every value may be None.
    # Confirm whether ``total=False`` was intended.
    limit: Optional[int]
    in_collection: Optional[bool]
    distance_limit: Optional[float]


class EmbeddingSearchService:
    indexing_page_ids: list[int] = []

    def __init__(self, dbs: DatabaseService, title: str):
        self.dbs = dbs

        self.title = title
        self.base_title = title.split("/")[0]

        self.title_index_helper = TitleIndexHelper(dbs)
        self.title_collection_helper = TitleCollectionHelper(dbs)
        self.page_index: PageIndexHelper = None

        self.tiktoken: TikTokenService = None

        self.mwapi = MediaWikiApi.create()
        self.openai_api = OpenAIApi.create()

        self.page_id: Optional[int] = None
        self.collection_id: Optional[int] = None

        self.title_index: Optional[TitleIndexModel] = None
        self.collection_info: Optional[TitleCollectionModel] = None

        self.page_info: dict = None
        self.unindexed_docs: list = None

    async def __aenter__(self):
        self.tiktoken = await TikTokenService.create()

        await self.title_index_helper.__aenter__()
        await self.title_collection_helper.__aenter__()

        self.title_index = await self.title_index_helper.find_by_title(self.title)
        if self.title_index is None:
            # Title may changed, get page info from page_id
            await self.load_page_info()
            self.title_index = await self.title_index_helper.find_by_page_id(
                self.page_info["pageid"]
            )
            self.page_id = self.page_info["pageid"]
        else:
            self.page_id = self.title_index.page_id
            self.collection_id = self.title_index.collection_id
            self.page_index = PageIndexHelper(self.dbs, self.collection_id)
            await self.page_index.__aenter__()

        return self

    async def __aexit__(self, exc_type, exc, tb):
        await self.title_index_helper.__aexit__(exc_type, exc, tb)
        await self.title_collection_helper.__aexit__(exc_type, exc, tb)

        if self.page_index is not None:
            await self.page_index.__aexit__(exc_type, exc, tb)

    async def page_index_exists(self, check_table=True):
        if check_table:
            return self.page_index and await self.page_index.table_exists()
        else:
            return self.page_index is not None

    async def load_page_info(self, reload=False):
        if self.page_info is None or reload:
            self.page_info = await self.mwapi.get_page_info(self.title)

    async def should_update_page_index(self, remote_update=False):
        if not remote_update:
            if self.title_index is None:
                return True
            return self.title_index.indexed_rev_id != self.title_index.latest_rev_id
        else:
            await self.load_page_info()

            if (
                self.title_index is not None
                and await self.page_index_exists()
                and self.title_index.indexed_rev_id == self.page_info["lastrevid"]
            ):
                # Not changed
                return False

            return True

    async def update_title_index(self, remote_update=False):
        if not await self.should_update_page_index(remote_update):
            return False

        await self.load_page_info()

        self.page_id = self.page_info["pageid"]

        if self.page_id in self.indexing_page_ids:
            raise EmbeddingRunningException("Page index is running now")

        self.title: str = self.page_info["title"]
        self.base_title = self.title.split("/")[0]

        # Find collection by base title
        self.collection_info = await self.title_collection_helper.find_by_title(
            self.base_title
        )
        if self.collection_info is None:
            # Create collection
            self.collection_info = await self.title_collection_helper.add(
                self.base_title
            )
            if self.collection_info is None:
                raise Exception("Failed to create title collection")
        self.collection_id = self.collection_info.id

        if self.title_index == None:
            # Create title record
            self.title_index = TitleIndexModel(
                title=self.page_info["title"],
                page_id=self.page_id,
                indexed_rev_id=None,
                latest_rev_id=self.page_info["lastrevid"],
                collection_id=self.collection_id,
                embedding=None,
            )
            self.title_index = await self.title_index_helper.add(self.title_index)
            if self.title_index is None:
                raise Exception("Failed to create title index")
        else:
            self.title_index.latest_rev_id = self.page_info["lastrevid"]
            # Title changed, remove embedding
            # Title sha1 will be updated by model helper
            if self.title_index.title != self.page_info["title"]:
                self.title_index.title = self.page_info["title"]
                self.title_index.embedding = None

            # Collection changed, remove old index
            if self.collection_id != self.title_index.collection_id:
                async with PageIndexHelper(self.dbs, self.title_index.collection_id) as old_page_index:
                    await old_page_index.init_table()
                    old_page_index.remove_by_page_id(self.page_id)
                self.title_index.collection_id = self.collection_id

            await self.title_index_helper.update(self.title_index)

        # Update collection main page id
        if (
            self.title == self.collection_info.title
            and self.page_id != self.collection_info.page_id
        ):
            await self.title_collection_helper.set_main_page_id(
                self.base_title, self.page_id
            )

        if self.page_index:
            await self.page_index.__aexit__(None, None, None)
        self.page_index = PageIndexHelper(self.dbs, self.collection_id)
        await self.page_index.__aenter__()
        await self.page_index.init_table()

    async def prepare_update_index(self):
        await self.update_title_index()

        page_content = await self.mwapi.parse_page(self.title)

        self.sentences = getWikiSentences(page_content)

        self.unindexed_docs = await self.page_index.get_unindexed_doc(
            self.sentences, self.page_id, with_temporary=False
        )

        return True

    async def get_unindexed_tokens(self):
        if self.unindexed_docs is None:
            return 0
        else:
            tokens = 0
            for doc in self.unindexed_docs:
                if "text" in doc:
                    tokens += await self.tiktoken.get_tokens(doc["text"])

            return tokens

    async def update_page_index(self, on_progress=None):
        if self.unindexed_docs is None:
            return False

        chunk_limit = 500

        chunk_len = 0
        doc_chunk = []
        total_token_usage = 0
        processed_len = 0

        async def on_embedding_progress(current, length):
            nonlocal processed_len

            indexed_docs = processed_len + current

            if on_progress is not None:
                await on_progress(indexed_docs, len(self.unindexed_docs))

        async def embedding_doc(doc_chunk):
            (doc_chunk, token_usage) = await self.openai_api.get_embeddings(
                doc_chunk, on_embedding_progress
            )
            await self.page_index.index_doc(doc_chunk, self.page_id)

            return token_usage

        if len(self.unindexed_docs) > 0:
            if on_progress is not None:
                await on_progress(0, len(self.unindexed_docs))

            for doc in self.unindexed_docs:
                chunk_len += len(doc)

                if chunk_len > chunk_limit:
                    total_token_usage += await embedding_doc(doc_chunk)
                    processed_len += len(doc_chunk)
                    if on_progress is not None:
                        await on_progress(processed_len, len(self.unindexed_docs))

                    doc_chunk = []
                    chunk_len = len(doc)

                doc_chunk.append(doc)

            if len(doc_chunk) > 0:
                total_token_usage += await embedding_doc(doc_chunk)
            if on_progress is not None:
                await on_progress(len(self.unindexed_docs), len(self.unindexed_docs))

            await self.page_index.remove_outdated_doc(self.sentences, self.page_id)

        # Update database
        # This task may take a long time, refresh model to retrieve latest data
        self.title_index = await self.title_index_helper.refresh(self.title_index)

        self.title_index.indexed_rev_id = self.page_info["lastrevid"]

        # Update title embedding
        if await self.title_index.awaitable_attrs.embedding is None:
            doc_chunk = [{"text": self.title}]
            (doc_chunk, token_usage) = await self.openai_api.get_embeddings(doc_chunk)
            total_token_usage += token_usage

            embedding = doc_chunk[0]["embedding"]
            self.title_index.embedding = embedding

        await self.title_index_helper.update(self.title_index)

        return total_token_usage

    async def search(
        self,
        query: str,
        limit: int = 10,
        in_collection: bool = False,
        distance_limit: float = 0.6,
    ):
        if self.page_index is None:
            raise Exception("Page index is not initialized")

        query_doc = [{"text": query}]
        query_doc, token_usage = await self.openai_api.get_embeddings(query_doc)
        query_embedding = query_doc[0]["embedding"]

        if query_embedding is None:
            return [], token_usage

        res = await self.page_index.search_text_embedding(
            query_embedding, in_collection, limit, self.page_id
        )
        if res:
            filtered = []
            for one in res:
                if one["distance"] < distance_limit:
                    filtered.append(dict(one))
            return filtered, token_usage
        else:
            return res, token_usage
重新创建项目 2 years ago			`from __future__ import annotations`
			`from typing import Optional, TypedDict`
增加在Collection中提问的功能 2 years ago
			`import sqlalchemy`
			`from api.model.embedding_search.title_collection import (`
			`TitleCollectionHelper,`
			`TitleCollectionModel,`
			`)`
将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`from api.model.embedding_search.title_index import TitleIndexHelper, TitleIndexModel`
重新创建项目 2 years ago			`from api.model.embedding_search.page_index import PageIndexHelper`
			`from service.database import DatabaseService`
			`from service.mediawiki_api import MediaWikiApi`
			`from service.openai_api import OpenAIApi`
			`from service.tiktoken import TikTokenService`
			`from utils.wiki import getWikiSentences`

增加在Collection中提问的功能 2 years ago
更改流式输出模式 2 years ago			`class EmbeddingRunningException(Exception):`
			`pass`

增加在Collection中提问的功能 2 years ago
重新创建项目 2 years ago			`class EmbeddingSearchArgs(TypedDict):`
			`limit: Optional[int]`
			`in_collection: Optional[bool]`
			`distance_limit: Optional[float]`

增加在Collection中提问的功能 2 years ago
重新创建项目 2 years ago			`class EmbeddingSearchService:`
更改流式输出模式 2 years ago			`indexing_page_ids: list[int] = []`

重新创建项目 2 years ago			`def __init__(self, dbs: DatabaseService, title: str):`
			`self.dbs = dbs`

			`self.title = title`
			`self.base_title = title.split("/")[0]`

将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`self.title_index_helper = TitleIndexHelper(dbs)`
			`self.title_collection_helper = TitleCollectionHelper(dbs)`
重新创建项目 2 years ago			`self.page_index: PageIndexHelper = None`

			`self.tiktoken: TikTokenService = None`

			`self.mwapi = MediaWikiApi.create()`
			`self.openai_api = OpenAIApi.create()`

将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`self.page_id: Optional[int] = None`
			`self.collection_id: Optional[int] = None`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`self.title_index: Optional[TitleIndexModel] = None`
将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`self.collection_info: Optional[TitleCollectionModel] = None`
重新创建项目 2 years ago
			`self.page_info: dict = None`
			`self.unindexed_docs: list = None`

			`async def __aenter__(self):`
			`self.tiktoken = await TikTokenService.create()`

将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`await self.title_index_helper.__aenter__()`
			`await self.title_collection_helper.__aenter__()`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`self.title_index = await self.title_index_helper.find_by_title(self.title)`
			`if self.title_index is None:`
			`# Title may changed, get page info from page_id`
			`await self.load_page_info()`
			`self.title_index = await self.title_index_helper.find_by_page_id(`
			`self.page_info["pageid"]`
			`)`
			`self.page_id = self.page_info["pageid"]`
			`else:`
			`self.page_id = self.title_index.page_id`
			`self.collection_id = self.title_index.collection_id`
			`self.page_index = PageIndexHelper(self.dbs, self.collection_id)`
重新创建项目 2 years ago			`await self.page_index.__aenter__()`

			`return self`
增加在Collection中提问的功能 2 years ago
重新创建项目 2 years ago			`async def __aexit__(self, exc_type, exc, tb):`
将TitleIndexHelper改为ORM调用，修复编辑时报错 2 years ago			`await self.title_index_helper.__aexit__(exc_type, exc, tb)`
			`await self.title_collection_helper.__aexit__(exc_type, exc, tb)`
重新创建项目 2 years ago
			`if self.page_index is not None:`
			`await self.page_index.__aexit__(exc_type, exc, tb)`

增加在Collection中提问的功能 2 years ago			`async def page_index_exists(self, check_table=True):`
重新创建项目 2 years ago			`if check_table:`
			`return self.page_index and await self.page_index.table_exists()`
			`else:`
			`return self.page_index is not None`

			`async def load_page_info(self, reload=False):`
			`if self.page_info is None or reload:`
			`self.page_info = await self.mwapi.get_page_info(self.title)`

增加在Collection中提问的功能 2 years ago			`async def should_update_page_index(self, remote_update=False):`
			`if not remote_update:`
			`if self.title_index is None:`
			`return True`
			`return self.title_index.indexed_rev_id != self.title_index.latest_rev_id`
			`else:`
			`await self.load_page_info()`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`if (`
			`self.title_index is not None`
			`and await self.page_index_exists()`
			`and self.title_index.indexed_rev_id == self.page_info["lastrevid"]`
			`):`
			`# Not changed`
			`return False`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`return True`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`async def update_title_index(self, remote_update=False):`
			`if not await self.should_update_page_index(remote_update):`
重新创建项目 2 years ago			`return False`

增加在Collection中提问的功能 2 years ago			`await self.load_page_info()`

重新创建项目 2 years ago			`self.page_id = self.page_info["pageid"]`

更改流式输出模式 2 years ago			`if self.page_id in self.indexing_page_ids:`
			`raise EmbeddingRunningException("Page index is running now")`

增加在Collection中提问的功能 2 years ago			`self.title: str = self.page_info["title"]`
			`self.base_title = self.title.split("/")[0]`

			`# Find collection by base title`
			`self.collection_info = await self.title_collection_helper.find_by_title(`
			`self.base_title`
			`)`
重新创建项目 2 years ago			`if self.collection_info is None:`
增加在Collection中提问的功能 2 years ago			`# Create collection`
			`self.collection_info = await self.title_collection_helper.add(`
			`self.base_title`
			`)`
			`if self.collection_info is None:`
重新创建项目 2 years ago			`raise Exception("Failed to create title collection")`
增加在Collection中提问的功能 2 years ago			`self.collection_id = self.collection_info.id`

			`if self.title_index == None:`
			`# Create title record`
			`self.title_index = TitleIndexModel(`
			`title=self.page_info["title"],`
			`page_id=self.page_id,`
			`indexed_rev_id=None,`
			`latest_rev_id=self.page_info["lastrevid"],`
			`collection_id=self.collection_id,`
			`embedding=None,`
			`)`
			`self.title_index = await self.title_index_helper.add(self.title_index)`
			`if self.title_index is None:`
			`raise Exception("Failed to create title index")`
重新创建项目 2 years ago			`else:`
增加在Collection中提问的功能 2 years ago			`self.title_index.latest_rev_id = self.page_info["lastrevid"]`
			`# Title changed, remove embedding`
			`# Title sha1 will be updated by model helper`
			`if self.title_index.title != self.page_info["title"]:`
			`self.title_index.title = self.page_info["title"]`
			`self.title_index.embedding = None`

			`# Collection changed, remove old index`
			`if self.collection_id != self.title_index.collection_id:`
			`async with PageIndexHelper(self.dbs, self.title_index.collection_id) as old_page_index:`
修复刷新页面索引时的逻辑错误 2 years ago			`await old_page_index.init_table()`
增加在Collection中提问的功能 2 years ago			`old_page_index.remove_by_page_id(self.page_id)`
			`self.title_index.collection_id = self.collection_id`

			`await self.title_index_helper.update(self.title_index)`

			`# Update collection main page id`
			`if (`
			`self.title == self.collection_info.title`
			`and self.page_id != self.collection_info.page_id`
			`):`
			`await self.title_collection_helper.set_main_page_id(`
			`self.base_title, self.page_id`
			`)`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`if self.page_index:`
			`await self.page_index.__aexit__(None, None, None)`
			`self.page_index = PageIndexHelper(self.dbs, self.collection_id)`
重新创建项目 2 years ago			`await self.page_index.__aenter__()`
			`await self.page_index.init_table()`

增加在Collection中提问的功能 2 years ago			`async def prepare_update_index(self):`
			`await self.update_title_index()`

重新创建项目 2 years ago			`page_content = await self.mwapi.parse_page(self.title)`

			`self.sentences = getWikiSentences(page_content)`

增加在Collection中提问的功能 2 years ago			`self.unindexed_docs = await self.page_index.get_unindexed_doc(`
			`self.sentences, self.page_id, with_temporary=False`
			`)`
重新创建项目 2 years ago
			`return True`

			`async def get_unindexed_tokens(self):`
			`if self.unindexed_docs is None:`
			`return 0`
			`else:`
			`tokens = 0`
			`for doc in self.unindexed_docs:`
			`if "text" in doc:`
			`tokens += await self.tiktoken.get_tokens(doc["text"])`

			`return tokens`

			`async def update_page_index(self, on_progress=None):`
			`if self.unindexed_docs is None:`
			`return False`

增加noawait，支持Azure API 2 years ago			`chunk_limit = 500`

			`chunk_len = 0`
			`doc_chunk = []`
重新创建项目 2 years ago			`total_token_usage = 0`
增加noawait，支持Azure API 2 years ago			`processed_len = 0`

			`async def on_embedding_progress(current, length):`
			`nonlocal processed_len`

			`indexed_docs = processed_len + current`

			`if on_progress is not None:`
			`await on_progress(indexed_docs, len(self.unindexed_docs))`
重新创建项目 2 years ago
			`async def embedding_doc(doc_chunk):`
增加在Collection中提问的功能 2 years ago			`(doc_chunk, token_usage) = await self.openai_api.get_embeddings(`
			`doc_chunk, on_embedding_progress`
			`)`
			`await self.page_index.index_doc(doc_chunk, self.page_id)`
重新创建项目 2 years ago
			`return token_usage`

			`if len(self.unindexed_docs) > 0:`
			`if on_progress is not None:`
			`await on_progress(0, len(self.unindexed_docs))`
增加在Collection中提问的功能 2 years ago
重新创建项目 2 years ago			`for doc in self.unindexed_docs:`
			`chunk_len += len(doc)`

			`if chunk_len > chunk_limit:`
			`total_token_usage += await embedding_doc(doc_chunk)`
			`processed_len += len(doc_chunk)`
			`if on_progress is not None:`
			`await on_progress(processed_len, len(self.unindexed_docs))`

			`doc_chunk = []`
			`chunk_len = len(doc)`

			`doc_chunk.append(doc)`

			`if len(doc_chunk) > 0:`
			`total_token_usage += await embedding_doc(doc_chunk)`
			`if on_progress is not None:`
			`await on_progress(len(self.unindexed_docs), len(self.unindexed_docs))`

增加在Collection中提问的功能 2 years ago			`await self.page_index.remove_outdated_doc(self.sentences, self.page_id)`
重新创建项目 2 years ago
			`# Update database`
增加在Collection中提问的功能 2 years ago			`# This task may take a long time, refresh model to retrieve latest data`
			`self.title_index = await self.title_index_helper.refresh(self.title_index)`

			`self.title_index.indexed_rev_id = self.page_info["lastrevid"]`

			`# Update title embedding`
			`if await self.title_index.awaitable_attrs.embedding is None:`
重新创建项目 2 years ago			`doc_chunk = [{"text": self.title}]`
			`(doc_chunk, token_usage) = await self.openai_api.get_embeddings(doc_chunk)`
			`total_token_usage += token_usage`

			`embedding = doc_chunk[0]["embedding"]`
增加在Collection中提问的功能 2 years ago			`self.title_index.embedding = embedding`
重新创建项目 2 years ago
增加在Collection中提问的功能 2 years ago			`await self.title_index_helper.update(self.title_index)`
重新创建项目 2 years ago
			`return total_token_usage`

增加在Collection中提问的功能 2 years ago			`async def search(`
			`self,`
			`query: str,`
			`limit: int = 10,`
			`in_collection: bool = False,`
			`distance_limit: float = 0.6,`
			`):`
重新创建项目 2 years ago			`if self.page_index is None:`
			`raise Exception("Page index is not initialized")`

			`query_doc = [{"text": query}]`
			`query_doc, token_usage = await self.openai_api.get_embeddings(query_doc)`
			`query_embedding = query_doc[0]["embedding"]`

			`if query_embedding is None:`
			`return [], token_usage`

增加在Collection中提问的功能 2 years ago			`res = await self.page_index.search_text_embedding(`
			`query_embedding, in_collection, limit, self.page_id`
			`)`
重新创建项目 2 years ago			`if res:`
			`filtered = []`
			`for one in res:`
			`if one["distance"] < distance_limit:`
			`filtered.append(dict(one))`
			`return filtered, token_usage`
			`else:`
			`return res, token_usage`